In [1]:
pip install openpyxl
Requirement already satisfied: openpyxl in c:\users\sajja\anaconda3\envs\dab200\lib\site-packages (3.1.2)
Requirement already satisfied: et-xmlfile in c:\users\sajja\anaconda3\envs\dab200\lib\site-packages (from openpyxl) (1.1.0)
Note: you may need to restart the kernel to use updated packages.
In [2]:
pip install plotly
Collecting plotly
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
     ---------------------------------------- 0.0/15.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/15.6 MB ? eta -:--:--
     --------------------------------------- 0.0/15.6 MB 487.6 kB/s eta 0:00:32
      --------------------------------------- 0.4/15.6 MB 3.3 MB/s eta 0:00:05
     -- ------------------------------------- 0.9/15.6 MB 5.6 MB/s eta 0:00:03
     ---- ----------------------------------- 1.6/15.6 MB 7.9 MB/s eta 0:00:02
     ------ --------------------------------- 2.4/15.6 MB 9.6 MB/s eta 0:00:02
     -------- ------------------------------- 3.4/15.6 MB 11.3 MB/s eta 0:00:02
     ----------- ---------------------------- 4.3/15.6 MB 12.6 MB/s eta 0:00:01
     ------------- -------------------------- 5.4/15.6 MB 13.9 MB/s eta 0:00:01
     ---------------- ----------------------- 6.6/15.6 MB 15.6 MB/s eta 0:00:01
     -------------------- ------------------- 7.9/15.6 MB 16.2 MB/s eta 0:00:01
     ----------------------- ---------------- 9.2/15.6 MB 17.2 MB/s eta 0:00:01
     -------------------------- ------------ 10.5/15.6 MB 21.8 MB/s eta 0:00:01
     ----------------------------- --------- 11.8/15.6 MB 25.1 MB/s eta 0:00:01
     -------------------------------- ------ 13.2/15.6 MB 27.3 MB/s eta 0:00:01
     ------------------------------------ -- 14.5/15.6 MB 28.5 MB/s eta 0:00:01
     --------------------------------------  15.6/15.6 MB 28.5 MB/s eta 0:00:01
     --------------------------------------  15.6/15.6 MB 28.5 MB/s eta 0:00:01
     --------------------------------------- 15.6/15.6 MB 24.2 MB/s eta 0:00:00
Requirement already satisfied: packaging in c:\users\sajja\anaconda3\envs\dab200\lib\site-packages (from plotly) (23.0)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.18.0 tenacity-8.2.3
Note: you may need to restart the kernel to use updated packages.
In [3]:
# Import the Pandas library
import pandas as pd

# Load your dataset 
df = pd.read_csv('MonthlyRetailTradeE-commerceSales(x1,000)_012017To082023.csv')


print(df.head())
  REF_DATE     GEO           DGUID  \
0  2017-01  Canada  2016A000011124   
1  2017-01  Canada  2016A000011124   
2  2017-01  Canada  2016A000011124   
3  2017-01  Canada  2016A000011124   
4  2017-01  Canada  2016A000011124   

  North American Industry Classification System (NAICS)  \
0                               Retail trade [44-45]      
1                               Retail trade [44-45]      
2                               Retail trade [44-45]      
3                               Retail trade [44-45]      
4              Motor vehicle and parts dealers [441]      

                     Sales          Adjustments      UOM  UOM_ID  \
0       Total retail sales           Unadjusted  Dollars      81   
1       Total retail sales  Seasonally adjusted  Dollars      81   
2  Retail e-commerce sales           Unadjusted  Dollars      81   
3  Retail e-commerce sales  Seasonally adjusted  Dollars      81   
4       Total retail sales           Unadjusted  Dollars      81   

  SCALAR_FACTOR  SCALAR_ID       VECTOR COORDINATE       VALUE STATUS  SYMBOL  \
0     thousands          3  v1446859481    1.1.1.1  41377009.0    NaN     NaN   
1     thousands          3  v1446859483    1.1.1.2  50417235.0    NaN     NaN   
2     thousands          3  v1446859482    1.1.2.1   1110045.0    NaN     NaN   
3     thousands          3  v1446859484    1.1.2.2   1236885.0    NaN     NaN   
4     thousands          3  v1446859485    1.2.1.1  10162441.0    NaN     NaN   

   TERMINATED  DECIMALS  
0         NaN         0  
1         NaN         0  
2         NaN         0  
3         NaN         0  
4         NaN         0  
In [4]:
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv('MonthlyRetailTradeE-commerceSales(x1,000)_012017To082023.csv')

# Check for missing values
missing_values = df.isnull().sum()
missing_values
Out[4]:
REF_DATE                                                     0
GEO                                                          0
DGUID                                                        0
North American Industry Classification System (NAICS)        0
Sales                                                        0
Adjustments                                                  0
UOM                                                          0
UOM_ID                                                       0
SCALAR_FACTOR                                                0
SCALAR_ID                                                    0
VECTOR                                                       0
COORDINATE                                                   0
VALUE                                                     7735
STATUS                                                   40341
SYMBOL                                                   54342
TERMINATED                                               54342
DECIMALS                                                     0
dtype: int64
In [5]:
import pandas as pd

# Remove rows with missing 'VALUE' entries
df_cleaned = df.dropna(subset=['VALUE'])

# Verify the shape of the cleaned dataset
print("Shape of cleaned dataset:", df_cleaned.shape)
Shape of cleaned dataset: (46607, 17)
In [6]:
# Assuming your cleaned DataFrame is named 'df_cleaned' and you want to save it as a CSV file
#df_cleaned.to_csv('MonthlyRetailTradeE-commerceSales(x1,000)_012017To082023_cleaned_dataset.csv', index=False)
In [7]:
df_cleaned.to_excel('MonthlyRetailTradeE-commerceSales_012017To082023_cleaned_dataset.xlsx', index=False)
In [8]:
df_cleaned.head()
Out[8]:
REF_DATE GEO DGUID North American Industry Classification System (NAICS) Sales Adjustments UOM UOM_ID SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS
0 2017-01 Canada 2016A000011124 Retail trade [44-45] Total retail sales Unadjusted Dollars 81 thousands 3 v1446859481 1.1.1.1 41377009.0 NaN NaN NaN 0
1 2017-01 Canada 2016A000011124 Retail trade [44-45] Total retail sales Seasonally adjusted Dollars 81 thousands 3 v1446859483 1.1.1.2 50417235.0 NaN NaN NaN 0
2 2017-01 Canada 2016A000011124 Retail trade [44-45] Retail e-commerce sales Unadjusted Dollars 81 thousands 3 v1446859482 1.1.2.1 1110045.0 NaN NaN NaN 0
3 2017-01 Canada 2016A000011124 Retail trade [44-45] Retail e-commerce sales Seasonally adjusted Dollars 81 thousands 3 v1446859484 1.1.2.2 1236885.0 NaN NaN NaN 0
4 2017-01 Canada 2016A000011124 Motor vehicle and parts dealers [441] Total retail sales Unadjusted Dollars 81 thousands 3 v1446859485 1.2.1.1 10162441.0 NaN NaN NaN 0
In [9]:
# Get the summary statistics
print(df_cleaned.describe())
        UOM_ID  SCALAR_ID         VALUE  SYMBOL  TERMINATED  DECIMALS
count  46607.0    46607.0  4.660700e+04     0.0         0.0   46607.0
mean      81.0        3.0  1.171890e+06     NaN         NaN       0.0
std        0.0        0.0  4.122422e+06     NaN         NaN       0.0
min       81.0        3.0  1.000000e+00     NaN         NaN       0.0
25%       81.0        3.0  3.806250e+04     NaN         NaN       0.0
50%       81.0        3.0  1.677500e+05     NaN         NaN       0.0
75%       81.0        3.0  6.889420e+05     NaN         NaN       0.0
max       81.0        3.0  7.288349e+07     NaN         NaN       0.0
In [10]:
import pandas as pd
import matplotlib.pyplot as plt


# Display basic statistics of numerical columns
numerical_summary = df_cleaned.describe()
numerical_summary
# Create a histogram for the 'VALUE' column
plt.figure(figsize=(8, 6))
n, bins, patches = plt.hist(df_cleaned['VALUE'], bins=20, color='skyblue')
plt.title('Distribution of VALUE')
plt.xlabel('VALUE')
plt.ylabel('Frequency')

# Initialize arrays to store values and coordinates
#value_array = []
#coordinates_array = []

# Annotate each bar with its frequency and coordinates (rotated 90 degrees)
for rect in patches:
    height = rect.get_height()
    x_coord = rect.get_x() + rect.get_width() / 2
    y_coord = height
    plt.annotate(f'({x_coord:.2f}, {y_coord:.2f})', xy=(x_coord, y_coord + 50), ha='center', va='bottom', rotation=90)
    
    # Append values and coordinates to respective arrays
    #value_array.append(height)
   # coordinates_array.append((x_coord, y_coord))

plt.show()

# Print the arrays
#print("Value Array:", value_array)
#print("Coordinates Array:", coordinates_array)
In [11]:
# Create a bar plot for the 'GEO' column with clear count labels
geo_counts = df_cleaned['GEO'].value_counts()
plt.figure(figsize=(10, 6))
ax = geo_counts.plot(kind='bar', color='lightcoral')
plt.title('Distribution of GEO')
plt.xlabel('GEO')
plt.ylabel('Count')

# Annotate the counts above the bars
for i, count in enumerate(geo_counts):
    ax.text(i, count + 20, str(count), ha='center', va='bottom', fontsize=9)

plt.xticks(rotation=90)
plt.tight_layout()  # Adjust plot layout for better visibility
plt.show()
In [12]:
import plotly.express as px

# Define the number of date labels to display (e.g., 10 labels)
num_labels = 25

# Calculate the step size to evenly select date labels
step = max(1, len(df['REF_DATE']) // num_labels)

# Create a subset of dates to display
selected_dates = df['REF_DATE'][::step]

# Create an interactive line plot using Plotly Express
fig = px.line(df, x='REF_DATE', y='VALUE', title='Sales Trends Over Time', labels={'REF_DATE': 'Date', 'VALUE': 'Sales (in Thousand Dollars)'})
fig.update_xaxes(tickvals=selected_dates, tickangle=45)
fig.show()
In [13]:
# Count unique values in categorical columns
categorical_columns = ['GEO', 'North American Industry Classification System (NAICS)', 'Adjustments']
for col in categorical_columns:
    unique_values = df_cleaned[col].value_counts()
    print(f'Unique values in {col}:')
    print(unique_values)
    print()
Unique values in GEO:
Canada                         4918
Ontario                        2459
Alberta                        2459
Montréal, Quebec               2457
British Columbia               2453
Quebec                         2452
Toronto, Ontario               2425
Manitoba                       2408
Saskatchewan                   2399
Vancouver, British Columbia    2380
New Brunswick                  2290
Nova Scotia                    2281
Newfoundland and Labrador      1999
Prince Edward Island           1702
Calgary, Alberta               1658
Quebec, Quebec                 1579
Winnipeg, Manitoba             1574
Gatineau, Quebec               1566
Edmonton, Alberta              1541
Ottawa, Ontario                1511
Northwest Territories           805
Yukon                           800
Nunavut                         491
Name: GEO, dtype: int64

Unique values in North American Industry Classification System (NAICS):
Retail trade [44-45]                                                                         3216
Motor vehicle and parts dealers [441]                                                        1724
Health and personal care retailers [456]                                                     1662
Food and beverage retailers [445]                                                            1656
Supermarkets and other grocery retailers (except convenience retailers) [44511]              1656
Gasoline stations and fuel vendors [457]                                                     1654
Sporting goods, hobby, musical instrument, book, and miscellaneous retailers [459]           1623
General merchandise retailers [455]                                                          1616
Electronics and appliances retailers [4492]                                                  1591
Furniture, home furnishings, electronics and appliances retailers [449]                      1589
Miscellaneous retailers [459B]                                                               1584
Automobile dealers [4411]                                                                    1575
Grocery and convenience retailers [4451]                                                     1556
Clothing, clothing accessories, shoes, jewelry, luggage and leather goods retailers [458]    1555
Convenience retailers and vending machine operators [44513]                                  1548
Furniture, floor covering, window treatment and other home furnishings retailers [4491]      1547
Automotive parts, accessories and tire retailers [4413]                                      1547
Other motor vehicle dealers [4412]                                                           1521
Building material and garden equipment and supplies dealers [444]                            1488
Furniture retailers [44911]                                                                  1476
New car dealers [44111]                                                                      1476
Sporting goods, hobby, musical instrument, book retailers and news dealers [459A]            1446
Clothing and clothing accessories retailers [4581]                                           1427
Floor covering, window treatment and other home furnishing retailers [44912]                 1416
Cannabis retailers [459993]                                                                  1273
Beer, wine and liquor retailers [4453]                                                       1271
Used car dealers [44112]                                                                     1263
Shoe retailers [4582]                                                                        1258
Jewellery, luggage and leather goods retailers [4583]                                        1237
Specialty food retailers [4452]                                                              1156
Name: North American Industry Classification System (NAICS), dtype: int64

Unique values in Adjustments:
Unadjusted             42868
Seasonally adjusted     3739
Name: Adjustments, dtype: int64

In [14]:
import pandas as pd

# Load the dataset from a CSV file
df = df_cleaned

# Initialize an empty DataFrame to store the comparison results
comparison_df = pd.DataFrame(columns=["REF_DATE", "GEO", "North American Industry Classification System (NAICS)", "Difference"])

# Get unique categories for which you want to compare values
categories = df["North American Industry Classification System (NAICS)"].unique()

# Loop through categories and compare "Unadjusted" and "Seasonally adjusted" values
for category in categories:
    category_data = df[df["North American Industry Classification System (NAICS)"] == category]
    unadjusted = category_data[category_data["Adjustments"] == "Unadjusted"]
    seasonally_adjusted = category_data[category_data["Adjustments"] == "Seasonally adjusted"]
    
    # Check if both "Unadjusted" and "Seasonally adjusted" data are available
    if len(unadjusted) > 0 and len(seasonally_adjusted) > 0:
        # Ensure the DataFrames have the same length before subtraction
        min_length = min(len(unadjusted), len(seasonally_adjusted))
        diff = seasonally_adjusted["VALUE"].values[:min_length] - unadjusted["VALUE"].values[:min_length]
        comparison_df = comparison_df.append({
            "REF_DATE": unadjusted["REF_DATE"].values[0],
            "GEO": unadjusted["GEO"].values[0],
            "North American Industry Classification System (NAICS)": category,
            "Difference": diff,
        }, ignore_index=True)

# Print the comparison results
print(comparison_df)
   REF_DATE     GEO North American Industry Classification System (NAICS)  \
0   2017-01  Canada                               Retail trade [44-45]      
1   2017-01  Canada              Motor vehicle and parts dealers [441]      
2   2017-01  Canada                          Automobile dealers [4411]      
3   2017-01  Canada                            New car dealers [44111]      
4   2017-01  Canada                           Used car dealers [44112]      
5   2017-01  Canada                 Other motor vehicle dealers [4412]      
6   2017-01  Canada  Automotive parts, accessories and tire retaile...      
7   2017-01  Canada  Building material and garden equipment and sup...      
8   2017-01  Canada                  Food and beverage retailers [445]      
9   2017-01  Canada           Grocery and convenience retailers [4451]      
10  2017-01  Canada  Supermarkets and other grocery retailers (exce...      
11  2017-01  Canada  Convenience retailers and vending machine oper...      
12  2017-01  Canada                    Specialty food retailers [4452]      
13  2017-01  Canada             Beer, wine and liquor retailers [4453]      
14  2017-01  Canada  Furniture, home furnishings, electronics and a...      
15  2017-01  Canada  Furniture, floor covering, window treatment an...      
16  2017-01  Canada                        Furniture retailers [44911]      
17  2017-01  Canada  Floor covering, window treatment and other hom...      
18  2017-01  Canada        Electronics and appliances retailers [4492]      
19  2017-01  Canada                General merchandise retailers [455]      
20  2017-01  Canada           Health and personal care retailers [456]      
21  2017-01  Canada           Gasoline stations and fuel vendors [457]      
22  2017-01  Canada  Clothing, clothing accessories, shoes, jewelry...      
23  2017-01  Canada  Clothing and clothing accessories retailers [4...      
24  2017-01  Canada                              Shoe retailers [4582]      
25  2017-01  Canada  Jewellery, luggage and leather goods retailers...      
26  2017-01  Canada  Sporting goods, hobby, musical instrument, boo...      
27  2017-01  Canada  Sporting goods, hobby, musical instrument, boo...      
28  2017-01  Canada                     Miscellaneous retailers [459B]      
29  2018-10  Canada                        Cannabis retailers [459993]      

                                           Difference  
0   [9040226.0, 126840.0, 182689.0, 45730.0, 26214...  
1   [3399086.0, 12926819.0, 13323656.0, 13272052.0...  
2   [2846376.0, 11355056.0, 11761008.0, 11613633.0...  
3   [2716176.0, 10436986.0, 10615666.0, 10732703.0...  
4   [130200.0, 918070.0, 928797.0, 956232.0, 71419...  
5   [366675.0, 833194.0, 801177.0, 873755.0, 67821...  
6   [186036.0, 738569.0, 761472.0, 782576.0, 79635...  
7   [913853.0, 2883769.0, 2841831.0, 3017570.0, 25...  
8   [1161041.0, 9997687.0, 10050652.0, 9877735.0, ...  
9   [488145.0, 7524109.0, 7582046.0, 7400919.0, 75...  
10  [402687.0, 6873402.0, 6932724.0, 6768383.0, 69...  
11  [85458.0, 650708.0, 649322.0, 632536.0, 635986...  
12  [126791.0, 474877.0, 521468.0, 464788.0, 56094...  
13  [546105.0, 1698622.0, 1780965.0, 1346124.0, 17...  
14  [422768.0, 3116812.0, 3135627.0, 3123924.0, 27...  
15  [222806.0, 1651598.0, 1597010.0, 1609602.0, 13...  
16  [122917.0, 1016750.0, 976866.0, 983560.0, 8114...  
17  [99888.0, 621255.0, 622389.0, 565396.0, 578461...  
18  [199962.0, 1461656.0, 1550633.0, 1509768.0, 15...  
19  [1101249.0, 5650483.0, 5824153.0, 5860496.0, 5...  
20  [178247.0, 3956670.0, 3967850.0, 3948422.0, 39...  
21  [590050.0, 5334490.0, 5238522.0, 5266784.0, 51...  
22  [828713.0, 2861514.0, 2768332.0, 2854327.0, 28...  
23  [618563.0, 2227376.0, 2157844.0, 2192824.0, 21...  
24  [110200.0, 351308.0, 327607.0, 361651.0, 29369...  
25  [99952.0, 282830.0, 277727.0, 301379.0, 289002...  
26  [445220.0, 2549047.0, 2549879.0, 2564029.0, 25...  
27  [200015.0, 1137805.0, 1144872.0, 1160480.0, 92...  
28  [245203.0, 1411243.0, 1374204.0, 1411103.0, 12...  
29  [0.0, 53251.0, 57997.0, 51177.0, 49890.0, 5083...  
C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\1648026756.py:23: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

In [15]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset from a CSV file
df = df_cleaned

# Initialize an empty DataFrame to store the summary statistics
summary_df = pd.DataFrame(columns=["Category", "Mean Difference", "Min Difference", "Max Difference"])

# Get unique categories for which you want to compare values
categories = df["North American Industry Classification System (NAICS)"].unique()

# Loop through categories and compare "Unadjusted" and "Seasonally adjusted" values
for category in categories:
    category_data = df[df["North American Industry Classification System (NAICS)"] == category]
    unadjusted = category_data[category_data["Adjustments"] == "Unadjusted"]
    seasonally_adjusted = category_data[category_data["Adjustments"] == "Seasonally adjusted"]
    
    # Check if both "Unadjusted" and "Seasonally adjusted" data are available
    if len(unadjusted) > 0 and len(seasonally_adjusted) > 0:
        # Ensure the DataFrames have the same length before subtraction
        min_length = min(len(unadjusted), len(seasonally_adjusted))
        diff = seasonally_adjusted["VALUE"].values[:min_length] - unadjusted["VALUE"].values[:min_length]
        # Calculate summary statistics
        mean_diff = diff.mean()
        min_diff = diff.min()
        max_diff = diff.max()
        summary_df = summary_df.append({
            "Category": category,
            "Mean Difference": mean_diff,
            "Min Difference": min_diff,
            "Max Difference": max_diff,
        }, ignore_index=True)

# Create a bar plot to visualize mean differences
plt.figure(figsize=(12, 6))
plt.bar(summary_df["Category"], summary_df["Mean Difference"])
plt.xlabel("NAICS Category")
plt.ylabel("Mean Difference (Seasonally Adjusted - Unadjusted)")
plt.title("Mean Differences Between Seasonally Adjusted and Unadjusted Data")
plt.xticks(rotation=90)
plt.tight_layout()

# Show the plot
plt.show()
C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:28: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3372784441.py:42: UserWarning:

Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations.

In [16]:
import matplotlib.pyplot as plt

# Calculate total sales for each sector
sector_sales = df_cleaned.groupby('North American Industry Classification System (NAICS)')['VALUE'].sum()

# Calculate the total sales across all sectors
total_sales = sector_sales.sum()

# Calculate the percentages
sector_percentages = (sector_sales / total_sales) * 100

# Sort the sectors by percentages in descending order
sector_percentages = sector_percentages.sort_values(ascending=False)

# Define a color palette for the bars
colors = plt.cm.viridis(sector_percentages / max(sector_percentages))

# Create a bar graph to visualize sales percentages by industry
plt.figure(figsize=(10, 8))
bars = plt.barh(sector_percentages.index, sector_percentages.values, color=colors)
plt.xlabel('Percentage of Total Sales', fontsize=14)
plt.ylabel('Industry', fontsize=14)
plt.title('Sales Percentages by Industry', fontsize=16)
plt.gca().invert_yaxis()

# Add labels to the bars
for sector, percentage in zip(sector_percentages.index, sector_percentages.values):
    plt.text(percentage + 1, sector, f'{percentage:.2f}%', va='center', fontsize=12)

plt.show()
In [17]:
import pandas as pd
import plotly.express as px


# Extract relevant columns
sales_data = df_cleaned[['REF_DATE', 'North American Industry Classification System (NAICS)', 'VALUE']]

# Convert REF_DATE to datetime
sales_data['REF_DATE'] = pd.to_datetime(sales_data['REF_DATE'])

# Group data by industry (North American Industry Classification System (NAICS)) and year
sales_by_industry = sales_data.groupby(['North American Industry Classification System (NAICS)', sales_data['REF_DATE'].dt.year])['VALUE'].sum().unstack().reset_index()

# Melt the data for use in Plotly Express
melted_data = pd.melt(sales_by_industry, id_vars='North American Industry Classification System (NAICS)', var_name='Year', value_name='Total Sales')

# Create an interactive line plot using Plotly Express
fig = px.line(
    melted_data,
    x='Year',
    y='Total Sales',
    color='North American Industry Classification System (NAICS)',
    labels={'Total Sales': 'Total Sales (in thousand dollars)'},
    title='Sales Trends by Industry'
)

fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Total Sales (in thousand dollars)',
    legend_title='Industry',
    xaxis_tickangle=-45,  # Rotate x-axis labels by 45 degrees
    width=1000,  # Increase the width of the plot
    height=1000,  # Increase the height of the plot
)
fig.show()
C:\Users\SAJJA\AppData\Local\Temp\ipykernel_21496\3411904461.py:9: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [18]:
# Group the data by the "North American Industry Classification System (NAICS)" column
industry_sales = df_cleaned.groupby("North American Industry Classification System (NAICS)")["VALUE"].sum()

# Convert the sales data to billions (divide by 1,000,000)
industry_sales_billion = industry_sales / 1_000_000

# Sort the industry sales data in descending order
industry_sales_billion = industry_sales_billion.sort_values(ascending=False)

# Display the sales data segmented by industry in billions
print(industry_sales_billion)
North American Industry Classification System (NAICS)
Retail trade [44-45]                                                                         22299.302008
Motor vehicle and parts dealers [441]                                                         4183.757364
Automobile dealers [4411]                                                                     3655.440550
New car dealers [44111]                                                                       3302.001521
Food and beverage retailers [445]                                                             3152.090122
Grocery and convenience retailers [4451]                                                      2372.693771
Supermarkets and other grocery retailers (except convenience retailers) [44511]               2183.357735
General merchandise retailers [455]                                                           1928.070526
Gasoline stations and fuel vendors [457]                                                      1577.571979
Health and personal care retailers [456]                                                      1247.231226
Building material and garden equipment and supplies dealers [444]                              961.486139
Furniture, home furnishings, electronics and appliances retailers [449]                        959.451792
Sporting goods, hobby, musical instrument, book, and miscellaneous retailers [459]             886.844768
Clothing, clothing accessories, shoes, jewelry, luggage and leather goods retailers [458]      852.435220
Clothing and clothing accessories retailers [4581]                                             657.609805
Beer, wine and liquor retailers [4453]                                                         550.403423
Miscellaneous retailers [459B]                                                                 542.210850
Furniture, floor covering, window treatment and other home furnishings retailers [4491]        502.519572
Electronics and appliances retailers [4492]                                                    457.181169
Sporting goods, hobby, musical instrument, book retailers and news dealers [459A]              339.998045
Used car dealers [44112]                                                                       335.497963
Furniture retailers [44911]                                                                    318.112023
Other motor vehicle dealers [4412]                                                             263.036074
Automotive parts, accessories and tire retailers [4413]                                        260.821328
Specialty food retailers [4452]                                                                209.560161
Convenience retailers and vending machine operators [44513]                                    192.304048
Floor covering, window treatment and other home furnishing retailers [44912]                   184.654946
Shoe retailers [4582]                                                                           95.100273
Jewellery, luggage and leather goods retailers [4583]                                           93.556810
Cannabis retailers [459993]                                                                     53.980520
Name: VALUE, dtype: float64
In [ ]: